---
title: "Passion Dashboard"
author: "Rémi Thériault"
output:
  flexdashboard::flex_dashboard:
    orientation: columns
    vertical_layout: fill
    social: menu
    source_code: embed
    # theme: lumen
    storyboard: false
params:
  API_TOKEN_PUBMED: API_TOKEN_PUBMED
---

```{r setup, include=FALSE}
library(flexdashboard)
query_pubmed <- TRUE
```

```{r packages}
# Load packages
library(extrafont)
library(easyPubMed)
library(dplyr)
library(purrr)
library(stringr)
library(stringi)
library(DT)
library(fuzzyjoin)
library(countrycode)
library(Ecfun)
library(tidyr)
library(waffle)
library(plotly)
library(ggplot2)
library(ggflags)
library(knitr)
library(lubridate)
library(xts)
library(tibble)
library(dygraphs)
library(rempsyc)
library(RColorBrewer)
library(tools)

```

```{r API_TOKEN_PUBMED, eval=query_pubmed}
API_TOKEN_PUBMED <- keyring::key_get("pubmed", "rempsyc")
# API_TOKEN_PUBMED <- Sys.getenv("API_TOKEN_PUBMED")
# API_TOKEN_PUBMED <- params$API_TOKEN_PUBMED

if (API_TOKEN_PUBMED == "") stop("API_TOKEN_PUBMED is an empty string. Terminating workflow.")

if (nchar(API_TOKEN_PUBMED) != 36) stop("API_TOKEN_PUBMED is not 36 characters-long. Terminating workflow.")

```

```{r batch_pubmed_download, results='hide', eval=query_pubmed}
# year_low <- 1940
# year_high <- 1949
# year_low <- 1950
# year_high <- 1959
# year_low <- 1960
# year_high <- 1969
# year_low <- 1970
# year_high <- 1979
# -----------------
# year_low <- 1980
# year_high <- 1989
# year_low <- 1990
# year_high <- 1999
# year_low <- 2000
# year_high <- 2009
year_low <- 2010
year_high <- 2019

# Download data
d.fls <- batch_pubmed_download(
  pubmed_query_string = paste(
    "passion [Title/Abstract]",
    "OR Dualistic Model of Passion [Text Word]",
    paste0("AND ('", year_low, "/01/01'[Date - Publication] : '", year_high, "/12/31'[Date - Publication])")
  ),
  dest_file_prefix = "data/easyPubMed_data_",
  api_key = API_TOKEN_PUBMED,
  batch_size = 5000)

```

```{r all_articles_to_df, eval=query_pubmed}
# Convert XLM data to a data frame of first authors
# articles.df <- table_articles_byAuth(d.fls, included_authors = "first")
all_articles_to_df <- function(d.fls){
  y <- lapply(seq_along(d.fls), function(x) {
    list.articles <- articles_to_list(d.fls[x])
    list.articles.df <- lapply(list.articles, article_to_df)
    articles.df <- do.call(rbind, list.articles.df)
  })
  do.call(rbind, y)
}

articles.df <- all_articles_to_df(d.fls)

articles.df <- articles.df %>% 
  filter(!duplicated(pmid))

```

```{r address_split, message=FALSE, warning=FALSE, eval=query_pubmed}
# Split address in university and department
addr.split <- str_split(articles.df$address, ",")

split_address <- function(addr.split, string) {
  ind1 <- map(addr.split, ~which(grepl(string, .x)))
  ind1 <- imap(addr.split, \(x, idx) pluck(ind1[[idx]])[1])
  ind1 <- imap(addr.split, \(x, idx) pluck(x, ind1[[idx]]))
  ind1 <- trimws(as.character(ind1))
  ind1 <- replace(ind1, ind1 == "NULL", NA)
}

string.dep <- "Department|Faculty|Center|School|Unit|Institute|Centre|Division|Unidad"
dep <- split_address(addr.split, string.dep)

string.uni <- "University"
uni <- split_address(addr.split, string.uni)

string.uni2 <- "University|College|School|Institute|Center|Centre|CEMIC, CONICET|CNRS|INSEAD"
uni2 <- split_address(addr.split, string.uni2)
uni3 <- ifelse(!is.na(uni), uni, uni2)

articles.df <- articles.df %>%
  mutate(department = dep,
         university = uni3)

articles.df <- articles.df %>%
  select(journal, year, university, department, address, lastname, firstname,
         month, day, jabbrv, title, doi, pmid, abstract)
```

```{r correct_universities, eval=query_pubmed}
# Correct a few university names manually
articles.df <- articles.df %>%
  mutate(university = case_when(
    university == "Stony Brook University" ~ "University of Colorado at Colorado Springs",
    university == "Technion-Israel Institute of Technology" ~ "Technion - Israel Institute of Technology",
    university == "University of Montreal" ~ "Université de Montréal",
    university == "Philipps-University of Marburg" ~ "Phillips-Universität Marburg",
    university == "University of Wisconsin-Madison" ~ "University of Wisconsin - Madison",
    TRUE ~ university
  ))

```

```{r countries_data, eval=query_pubmed}
# Download university + country data
countries <- read.csv(
  "https://raw.githubusercontent.com/endSly/world-universities-csv/master/world-universities.csv",
  header = FALSE)
names(countries) <- c("country_code", "university", "website")
countries <- countries[1:2]

# Correct/add a few university countries manually
countries <- countries %>% 
  mutate(
    country_code = replace(country_code, 
                           university == "University of the Netherlands Antilles, Curacao", "CW"),
    country_code = replace(country_code, 
                           university == "University of Sint Eustatius School of Medicine", "SX"),
    country_code = replace(country_code, 
                           university == "St.James's School of Medicine, Bonaire" |
                             university == "American University of the Caribbean, Sint Maarten" |
                             university == "International University School of Medicine (IUSOM)", "BQ")) %>% 
  add_row(country_code = "US", university = "University of Colorado") %>%
  add_row(country_code = "US", university = "Stony Brook University") %>%
  add_row(country_code = "GB", university = "York St John University") %>%
  add_row(country_code = "DE", university = "Max Planck Institute for Human Development") %>%
  add_row(country_code = "AR", university = "CEMIC, CONICET") %>%
  add_row(country_code = "VN", university = "SWPS University of Social Sciences and Humanities") %>%
  add_row(country_code = "SG", university = "Yale-NUS College") %>%
  add_row(country_code = "US", university = "Rutgers Business School-Newark and New Brunswick") %>%
  add_row(country_code = "BR", university = "Institute D'Or for Research and Teaching") %>%
  add_row(country_code = "GB", university = "Moray House School of Education and Sport") %>%
  add_row(country_code = "AU", university = "Institute for Positive Psychology and Education") %>%
  add_row(country_code = "FR", university = "CNRS") %>%
  add_row(country_code = "US", university = "Columbia Business School") %>%
  add_row(country_code = "US", university = "Office of Population Research") %>%
  add_row(country_code = "US", university = "School of Family Life") %>%
  add_row(country_code = "CH", university = "Jacobs Center for Productive Youth Development") %>%
  add_row(country_code = "KR", university = "SKK Graduate School of Business") %>%
  add_row(country_code = "SG", university = "Lee Kong Chian School of Business") %>%
  add_row(country_code = "US", university = "Tepper School of Business") %>%
  add_row(country_code = "US", university = "Stephen M. Ross School of Business") %>%
  add_row(country_code = "US", university = "Fuqua School of Business") %>%
  add_row(country_code = "US", university = "Jones Graduate School of Business") %>%
  add_row(country_code = "US", university = "Questrom School of Business") %>%
  add_row(country_code = "US", university = "Booth School of Business") %>%
  add_row(country_code = "FR", university = "INSEAD") %>% 
  add_row(country_code = "BE", university = "University of Liege") %>%
  add_row(country_code = "KR", university = "Sungkyunkwan University") %>%
  add_row(country_code = "DE", university = "University of Tubingen") %>%
  add_row(country_code = "AU", university = "Melbourne School of Psychological Sciences") %>%
  add_row(country_code = "AT", university = "University of Vienna") %>%
  add_row(country_code = "NL", university = "Vrije Universiteit Amsterdam") %>%
  add_row(country_code = "CA", university = "Montreal Behavioural Medicine Centre (MBMC)") %>%
  add_row(country_code = "GB", university = "Manchester Centre for Health Psychology") %>%
  add_row(country_code = "US", university = "Annenberg School for Communication and Journalism") %>%
  add_row(country_code = "US", university = 
            "National Center for Posttraumatic Stress Disorder at VA Boston Healthcare System") %>%
  add_row(country_code = "US", university = "Cincinnati Children's Hospital Medical Center") %>%
  add_row(country_code = "US", university = "University of Wisconsin")

# Custom function for two-sided vlookup
partial_vlookup <- function(pattern, lookup_vector) {
  out <- map_chr(pattern, \(x) {
    out <- grep(x, lookup_vector, value = TRUE, fixed = TRUE)[1]
    if (is.na(out)) {
      out <- rgrep(lookup_vector, x, value = TRUE, fixed = TRUE)[1]
    }
    out
    })
  out
}

```

```{r partial_vlookup, eval=query_pubmed}
# Match universities and countries
articles.df2 <- articles.df %>%
  mutate(
    university_old = university,
    university = partial_vlookup(university, countries$university),
    university = ifelse(is.na(university), partial_vlookup(address, countries$university), university),
    .after = university)

articles.df3 <- articles.df2 %>%
  left_join(countries, by = "university", multiple = "first") %>% 
  relocate(country_code, .after = year)

```

```{r country_code_conversion, eval=query_pubmed}
list.countries <- c(unique(countryname_dict$country.name.en), "USA", "Korea", "UK")
get_country <- function(x, list.countries) {
  rgrep(list.countries, toTitleCase(tolower(x)), value = TRUE, fixed = TRUE)[1]
}

# Get full name country, continent, and region
articles.df4 <- articles.df3 %>% 
  rowwise() %>% 
  mutate(country = countrycode(country_code, "genc2c", "country.name"),
         country = ifelse(is.na(country), 
                          get_country(address, list.countries), 
                          country),
         country_code = ifelse(is.na(country_code), 
                               countrycode(country, "country.name", "genc2c"), 
                               country_code),
         region = countrycode(country_code, "genc2c", "un.regionsub.name"),
         continent = countrycode(country_code, "genc2c", "continent"),
         continent = case_when(continent == "Americas" ~ region,
                               TRUE ~ continent),
         doi = paste0("https://doi.org/", doi),
         .after = country_code) %>% 
  mutate(date = paste(year, month, day, sep = "-"),
         date = as_date(date)) %>% 
  ungroup()

```

```{r save file}
saveRDS(articles.df4, paste0("data/articles_", year_low, "_", year_high, ".rds"))
```